The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions.
The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.
It contains only numerical input variables which are the result of a PCA transformation.
Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.
Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning.
Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
# ----- Notebook setup: imports, plotting defaults, pandas display options -----
import bhishan
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.options.display.float_format = '{:,}'.format # df.A.value_counts().astype(float)
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
# record the library versions this notebook was run with
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
import scipy
from scipy import stats
# scale and split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# dimension reduction for visualization
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# hyperparameters search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# prediction
from sklearn.model_selection import cross_val_predict
# model evaluation metrics
from sklearn.model_selection import cross_val_score
# roc auc etc scores
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
# roc auc curves
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# confusion matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# plotly in offline/notebook mode (no account needed)
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Return a DataFrame listing the public attributes of an object.

    Parameters
    ----------
    obj : object
        Any object (module, class, instance); inspected with ``dir()``.
    ncols : int
        Number of columns in the returned DataFrame.
    start : str, tuple or list, optional
        Keep only attributes starting with this prefix (or any of these).
    inside : str, tuple or list, optional
        Keep only attributes containing this substring (or any of these).

    Example
    =======
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    # public names only: skip dunders and private attributes
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    # drop common module aliases that pollute instance listings
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    elif isinstance(start, (tuple, list)):
        # BUG FIX: the original nested comprehension duplicated a name
        # whenever it matched more than one prefix; any() keeps each once.
        lst = [elem for elem in lst
               if any(elem.startswith(s) for s in start)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    elif isinstance(inside, (tuple, list)):
        # same de-duplication fix for the substring filters
        lst = [elem for elem in lst
               if any(s in elem for s in inside)]
    # lay the names out in ncols columns, padding the tail with ''
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def print_scores(ytest, ypreds):
    """Print precision/recall/F1, a classification report and a styled
    confusion matrix for binary fraud predictions.

    Parameters
    ----------
    ytest : array-like
        True 0/1 labels (1 = fraud, per the 'Class' target).
    ypreds : array-like
        Predicted 0/1 labels.
    """
    print(f'Precision: {precision_score(ytest,ypreds): .2f}')
    print(f'Recall : {recall_score(ytest,ypreds): .2f}')
    print(f'F1-score : {f1_score(ytest,ypreds): .2f}')
    c = classification_report(ytest, ypreds)
    print(c)
    cm = confusion_matrix(ytest, ypreds)
    # BUG FIX: labels said 'Not-converted'/'Converted' — copy-paste from a
    # conversion-rate project; this dataset's classes are fraud / not-fraud
    # (consistent with the LIME class_names used later in the notebook).
    names = ['Not-fraud', 'Fraud']
    df_cm = pd.DataFrame(cm, index=names, columns=names)
    df_cm = df_cm.style.background_gradient()
    display(df_cm)
# Running results table: each model section below appends one evaluation row.
df_eval = pd.DataFrame({'Model': [],
'Description':[],
'Accuracy':[],
'Precision':[],
'Recall':[],
'F1':[],
'AUC':[],
})
# Kaggle credit-card fraud data (284,807 transactions, 492 frauds).
df = pd.read_csv('../data/raw/creditcard.csv.zip',compression='zip')
print(df.shape)
df.head()
from sklearn.model_selection import train_test_split
target = 'Class'  # response variable: 1 = fraud, 0 = otherwise
# 80/20 train/test split, stratified so the ~0.172% fraud rate is preserved.
df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
df.drop(target,axis=1),
df[target],
test_size=0.2,
random_state=SEED,
stratify=df[target])
# Split the training portion again into train/validation (also stratified);
# the validation split is used later for Optuna trial scoring.
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_orig,
ser_ytrain_orig,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_orig)
print(df_Xtrain.shape)
df_Xtrain.head()
Key LightGBM hyperparameters to consider when tuning:
`n_jobs=-1`, `random_state=None`, `learning_rate`, `max_depth`,
`min_child_samples`, `reg_alpha`, `reg_lambda`, `subsample`,
`min_child_weight`, `min_split_gain`, `n_estimators`, `num_leaves`,
`objective`, `boost='gbdt'`, `metric='auc'`
import joblib
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
# time
time_start = time.time()
model_name = 'lightgbm'
desc = 'default'
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()
# model
clf_lgb = lgbm.LGBMClassifier(random_state=SEED)
# fit and save the model
clf_lgb.fit(Xtr, ytr)
joblib.dump(clf_lgb,'../outputs/clf_lgb.pkl')
# load the saved model
clf_lgb = joblib.load('../outputs/clf_lgb.pkl')
# predictions
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv
# model evaluation
average = 'binary'
row_eval = [model_name,desc,
accuracy_score(ytx, ypreds),
precision_score(ytx, ypreds, average=average),
recall_score(ytx, ypreds, average=average),
f1_score(ytx, ypreds, average=average),
roc_auc_score(ytx, ypreds),
]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)
cm = confusion_matrix(ytest, ypreds)
vals = cm.ravel()
print('lightGBM Results')
print('-'*25)
print('Total Frauds: ', vals[2] + vals[3])
print('Incorrect Frauds: ', vals[2])
print('Incorrect Percent: ', round(vals[2]*100/(vals[2]+vals[3]),2),'%')
from bhishan.util_plot_model_eval import plotly_binary_clf_evaluation
# BUG FIX: `ytest` and `yprobs` were undefined here (NameError). Use the
# test labels ytx and derive fraud probabilities from the fitted classifier.
yprobs = clf_lgb.predict_proba(Xtx)[:, 1]
plotly_binary_clf_evaluation('lgbm with n_estimators = 100',clf_lgb,ytx,ypreds,yprobs,df)
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
# Train each trial on the smaller train split; score on the validation split.
dtrain = lgb.Dataset(df_Xtrain, label= ser_ytrain)
def objective(trial):
    """Optuna objective: train a LightGBM booster with trial-sampled
    hyperparameters and return ROC AUC on the validation split.

    NOTE(review): AUC is computed on rounded 0/1 predictions rather than the
    raw scores, which discards ranking information — confirm this is intended.
    """
    params_lgb_optuna = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
        'boosting_type': 'gbdt',
        # lambda (L1/L2 regularization, sampled on a log scale)
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        # leaves
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # fraction (feature/row subsampling)
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.2, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # child (leaf-size constraints)
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-5, 1e4),
    }
    booster_gbm = lgb.train(params_lgb_optuna, dtrain)
    # predict() on a binary booster returns probabilities of class 1
    yscores = booster_gbm.predict(df_Xvalid)
    ypreds = np.rint(yscores)  # hard labels via 0.5 threshold
    score = roc_auc_score(ser_yvalid.to_numpy().ravel(),
                          ypreds)
    return score
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 10 # make it large
# The study is persisted in sqlite so it can be resumed (load_if_exists=True).
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='lgb_optuna',
                            storage='sqlite:///lgb_optuna_conversion_rate.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS)
# Resume from last study
N_TRIALS = 100 # make it large
study = optuna.create_study(direction='maximize',
                            sampler=sampler,
                            study_name='lgb_optuna',
                            storage='sqlite:///lgb_optuna_conversion_rate.db',
                            load_if_exists=True)
study.optimize(objective, n_trials=N_TRIALS)
print(f'Number of finished trials: {len(study.trials)}')
# best trial
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
# ----- LightGBM with Optuna-tuned hyperparameters -----
time_start = time.time()
model_name = 'lightgbm'
desc = 'grid search optuna'
# Full training data (train+valid) and held-out test data as plain arrays.
Xtr = df_Xtrain_orig
ytr = ser_ytrain_orig.to_numpy().ravel()
Xtx = df_Xtest
ytx = ser_ytest.to_numpy().ravel()
Xvd = df_Xvalid
yvd = ser_yvalid.to_numpy().ravel()
# use best model
params_best = study.best_trial.params
# BUG FIX: was `clf_lgb = clf_lgb = lgbm.LGBMClassifier(...)` (duplicated
# assignment, harmless but clearly a typo).
clf_lgb = lgbm.LGBMClassifier(random_state=SEED)
clf_lgb.set_params(**params_best)
# fit and save the model
clf_lgb.fit(Xtr, ytr)
joblib.dump(clf_lgb,'../outputs/clf_lgb_grid_search_optuna.pkl')
# load the saved model
clf_lgb = joblib.load('../outputs/clf_lgb_grid_search_optuna.pkl')
# predictions
# NOTE(review): as above, cross_val_predict refits a clone on folds of the
# test set — confirm this evaluation protocol is intended.
skf = StratifiedKFold(n_splits=2,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(clf_lgb, Xtx, ytx, cv=skf)
ypreds = ypreds_cv
# model evaluation
average = 'binary'
row_eval = [model_name,desc,
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            roc_auc_score(ytx, ypreds),
            ]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
df_eval.sort_values('Recall',ascending=False).style.background_gradient(subset='Recall')
# BUG FIX: `ytest` was never defined (NameError); the test labels are ytx.
cm = confusion_matrix(ytx,ypreds)
vals = cm.ravel()  # [tn, fp, fn, tp]
cm
print('LightGBM Grid Search Results')
print('-'*25)
print('Total Frauds: ', vals[2] + vals[3])   # fn + tp = actual frauds in test
print('Incorrect Frauds: ', vals[2])         # fn = frauds the model missed
print('Incorrect Percent: ', round(vals[2]*100/(vals[2]+vals[3]),2),'%')
# BUG FIX: `ytest` and `yprobs` were undefined here (NameError). Use the
# test labels ytx and fraud probabilities from the tuned classifier.
yprobs = clf_lgb.predict_proba(Xtx)[:, 1]
plotly_binary_clf_evaluation('clf_lgb_optuna',clf_lgb,ytx,ypreds,yprobs,df)
# Bar chart of the tuned model's feature importances, largest first.
arr_fimp = clf_lgb.feature_importances_
feature_index = df.columns.drop(target)  # all columns except the target
df_fimp = (pd.DataFrame({'Importances': arr_fimp}, index=feature_index)
             .sort_values('Importances', ascending=False))
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=df_fimp['Importances'], y=df_fimp.index)
# Annotate each bar with its importance value at the bar's right edge.
for patch in ax.patches:
    width = patch.get_width()
    ax.text(width, patch.get_y(), f'{width:.2f}',
            fontsize=15, color='indigo', va='top', ha='left')
%%time
# 5-fold cross-validated F1 of the tuned model on the *entire* dataset.
X = df.drop('Class',axis=1).to_numpy()
y = df['Class'].to_numpy()
scores = cross_val_score(clf_lgb,
                         X,y,
                         scoring ='f1',
                         cv=5,
                         n_jobs=-1,
                         verbose=2)
# Plotly table showing mean and std of the CV F1 scores.
trace = go.Table(
    header=dict(values=['<b>F1 score mean<b>', '<b>F1 score std<b>'],
                line = dict(color='#7D7F80'),
                fill = dict(color='#a1c3d1'),
                align = ['center'],
                font = dict(size = 15)),
    cells=dict(values=[np.round(scores.mean(),6),
                       np.round(scores.std(),6)],
               line = dict(color='#7D7F80'),
               fill = dict(color='#EDFAFF'),
               align = ['center'], font = dict(size = 15)))
layout = dict(width=800, height=500,
              title = 'Cross validation - 5 folds [F1 score]',
              font = dict(size = 15))
fig = dict(data=[trace], layout=layout)
# NOTE(review): py.iplot renders inline in the notebook; whether `filename`
# writes the html file depends on the plotly version — confirm the export.
py.iplot(fig, filename = '../reports/figures/lightgbm_cross_validation.html')
df.head(2)
import eli5
# Global feature weights of the fitted lightgbm model.
eli5.show_weights(clf_lgb)
from eli5.sklearn import PermutationImportance
feature_names = df_Xtrain.columns.tolist()
# Permutation importance of the model on the held-out test set.
perm = PermutationImportance(clf_lgb).fit(df_Xtest, ytx)
eli5.show_weights(perm, feature_names=feature_names)
df.head(2)
# Sanity-check the model's prediction on a single test example.
idx = 0
example = df_Xtest.iloc[idx]      # pandas Series: one row of features
answer = ser_ytest.iloc[idx]      # true label for that row
feature_names = df_Xtest.columns.tolist()
# reshape(-1,1).T turns the 1-d feature vector into a (1, n_features) batch
prediction = clf_lgb.predict(example.to_numpy().reshape(-1,1).T)
print(f'answer = {answer}')
print('prediction = ', prediction[0])
print()
print(example)
print(feature_names)
import lime
import lime.lime_tabular
# categorical_features = []
# categorical_features_idx = [df_Xtrain.columns.get_loc(col)
#                             for col in categorical_features]
NUM_FEATURES = len(feature_names)  # explain using all features
# LIME explainer built from the training matrix; class index 1 is 'Fraud',
# matching the 0/1 encoding of the 'Class' target.
explainer = lime.lime_tabular.LimeTabularExplainer(df_Xtrain.to_numpy(),
                                                   feature_names=feature_names,
                                                   class_names=['Not-fraud','Fraud'],
                                                   mode='classification')
# NOTE(review): `example` is a pandas Series where a 1-d numpy array is
# conventional — confirm lime accepts it, else pass example.to_numpy().
exp = explainer.explain_instance(example, clf_lgb.predict_proba,
                                 num_features=NUM_FEATURES)
exp.show_in_notebook(show_table=True)
# as_pyplot_figure returns a matplotlib Figure (set_figheight below is a
# Figure method), despite the variable being named `ax`.
ax = exp.as_pyplot_figure(); # use semicolon
ax.set_figheight(12);
# show_method_attributes(ax,start='set')
import shap
shap.initjs()  # load the JS needed for interactive force plots
show_method_attributes(shap)
%%time
# TreeExplainer computes SHAP values for tree ensembles such as LightGBM.
explainer = shap.TreeExplainer(clf_lgb)
# For this binary model shap_values appears to be one array per class
# (the [1] indexing and len(expected_value) inspection below rely on that);
# np.array stacks them into shape (n_classes, n_samples, n_features).
shap_values = np.array(explainer.shap_values(df_Xtest))
# shap.force_plot?
df_Xtest.shape, explainer.expected_value, type(explainer.expected_value), len(explainer.expected_value)
# Force plot for one test row, using the fraud class (index 1).
idx = 5
shap.force_plot(explainer.expected_value[1],
                shap_values[1][idx,:],
                df_Xtest.iloc[idx,:] # this is just for giving feature names
                )
# many points
NUM = 1000
shap.force_plot(explainer.expected_value[1],
                shap_values[1][:NUM,:],
                df_Xtest.iloc[:NUM,:] # this is just for giving feature names
                )
shap.summary_plot(shap_values, df_Xtest)
# shap.dependence_plot?
shap_values = shap.TreeExplainer(clf_lgb).shap_values(df_Xtest)